In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Display settings
pd.set_option('display.max_columns', None)
sns.set(style="whitegrid")
In [2]:
# Replace with your dataset path
df = pd.read_csv(r'C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv', encoding='latin1')
df.head()
Out[2]:
| Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | State | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | CA-2016-152156 | 11/8/2016 | 11/11/2016 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | 2 | CA-2016-152156 | 11/8/2016 | 11/11/2016 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | Kentucky | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | 3 | CA-2016-138688 | 6/12/2016 | 6/16/2016 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | California | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | 4 | US-2015-108966 | 10/11/2015 | 10/18/2015 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | FUR-TA-10000577 | Furniture | Tables | Bretford CR4500 Series Slim Rectangular Table | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | 5 | US-2015-108966 | 10/11/2015 | 10/18/2015 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.3680 | 2 | 0.20 | 2.5164 |
In [5]:
# Check missing values
print("Missing values per column:\n", df.isnull().sum())
Missing values per column: Row ID 0 Order ID 0 Order Date 0 Ship Date 0 Ship Mode 0 Customer ID 0 Customer Name 0 Segment 0 Country 0 City 0 State 0 Postal Code 0 Region 0 Product ID 0 Category 0 Sub-Category 0 Product Name 0 Sales 0 Quantity 0 Discount 0 Profit 0 dtype: int64
In [7]:
# Fill numeric missing values with median
#df.fillna(df.median(), inplace=True)
df.fillna(df.median(numeric_only=True), inplace=True)
In [9]:
# Drop duplicates if any
df.drop_duplicates(inplace=True)
In [11]:
# Data types
print("Data types:\n", df.dtypes)
Data types: Row ID int64 Order ID object Order Date object Ship Date object Ship Mode object Customer ID object Customer Name object Segment object Country object City object State object Postal Code int64 Region object Product ID object Category object Sub-Category object Product Name object Sales float64 Quantity int64 Discount float64 Profit float64 dtype: object
In [ ]:
# Summary statistics
df.describe()
Out[ ]:
| Row ID | Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|
| count | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
| mean | 4997.500000 | 55190.379428 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
| std | 2885.163629 | 32063.693350 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
| min | 1.000000 | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
| 25% | 2499.250000 | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
| 50% | 4997.500000 | 56430.500000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
| 75% | 7495.750000 | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
| max | 9994.000000 | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
In [27]:
# Example target variable 'Outcome' (0 = No Diabetes, 1 = Diabetes)
sns.countplot(x='Profit', data=df)
plt.title("Emade Store Data Analytics")
plt.show()
In [15]:
# Histogram for Age
sns.histplot(df['Sales'], kde=True, bins=20)
plt.title("Sales Data")
plt.show()
In [17]:
# Boxplot for Glucose
sns.boxplot(x=df['Quantity'])
plt.title("Quantity Spread")
plt.show()
In [19]:
# Glucose vs Outcome
sns.boxplot(x='Quantity', y='Discount', data=df)
plt.title("Quantity vs Discount Outcome")
plt.show()
In [21]:
# Age vs BMI colored by Outcome
plt.figure(figsize=(8,6))
sns.scatterplot(x='Quantity', y='Discount', hue='Sales', data=df, alpha=0.7)
plt.title("Quantity vs Discount")
plt.show()
In [29]:
numeric_df = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(8,6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()
In [31]:
# Install if not already installed
!pip install ydata-profiling
from ydata_profiling import ProfileReport
# Generate profiling report
profile = ProfileReport(df, title="Healthcare Data Profiling Report", explorative=True)
# Save to HTML
profile.to_file("healthcare_profile_report.html")
print("✅ Profiling report generated: healthcare_profile_report.html")
Requirement already satisfied: ydata-profiling in c:\users\b3stu\anaconda3\lib\site-packages (4.16.1) Requirement already satisfied: scipy<1.16,>=1.4.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.13.1) Requirement already satisfied: pandas!=1.4.0,<3.0,>1.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (2.2.2) Requirement already satisfied: matplotlib<=3.10,>=3.5 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (3.9.2) Requirement already satisfied: pydantic>=2 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (2.8.2) Requirement already satisfied: PyYAML<6.1,>=5.0.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (6.0.1) Requirement already satisfied: jinja2<3.2,>=2.11.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (3.1.4) Requirement already satisfied: visions<0.8.2,>=0.7.5 in c:\users\b3stu\anaconda3\lib\site-packages (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (0.8.1) Requirement already satisfied: numpy<2.2,>=1.16.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.26.4) Requirement already satisfied: htmlmin==0.1.12 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.1.12) Requirement already satisfied: phik<0.13,>=0.11.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.12.5) Requirement already satisfied: requests<3,>=2.24.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (2.32.3) Requirement already satisfied: tqdm<5,>=4.48.2 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (4.66.5) Requirement already satisfied: seaborn<0.14,>=0.10.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.13.2) Requirement already satisfied: multimethod<2,>=1.4 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.12) Requirement already satisfied: statsmodels<1,>=0.13.2 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.14.2) Requirement already satisfied: typeguard<5,>=3 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (4.4.4) Requirement already satisfied: imagehash==4.3.1 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (4.3.1) Requirement already satisfied: wordcloud>=1.9.3 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.9.4) Requirement already satisfied: dacite>=1.8 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (1.9.2) Requirement already satisfied: numba<=0.61,>=0.56.0 in c:\users\b3stu\anaconda3\lib\site-packages (from ydata-profiling) (0.60.0) Requirement already satisfied: PyWavelets in c:\users\b3stu\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (1.7.0) Requirement already satisfied: pillow in c:\users\b3stu\anaconda3\lib\site-packages (from imagehash==4.3.1->ydata-profiling) (10.4.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\b3stu\anaconda3\lib\site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.1.3) Requirement already satisfied: contourpy>=1.0.1 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (24.1) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (3.1.2) Requirement already satisfied: python-dateutil>=2.7 in c:\users\b3stu\anaconda3\lib\site-packages (from matplotlib<=3.10,>=3.5->ydata-profiling) (2.9.0.post0) Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in c:\users\b3stu\anaconda3\lib\site-packages (from numba<=0.61,>=0.56.0->ydata-profiling) (0.43.0) Requirement already satisfied: pytz>=2020.1 in c:\users\b3stu\anaconda3\lib\site-packages (from pandas!=1.4.0,<3.0,>1.1->ydata-profiling) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\b3stu\anaconda3\lib\site-packages (from pandas!=1.4.0,<3.0,>1.1->ydata-profiling) (2023.3) Requirement already satisfied: joblib>=0.14.1 in c:\users\b3stu\anaconda3\lib\site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.4.2) Requirement already satisfied: annotated-types>=0.4.0 in c:\users\b3stu\anaconda3\lib\site-packages (from pydantic>=2->ydata-profiling) (0.6.0) Requirement already satisfied: pydantic-core==2.20.1 in c:\users\b3stu\anaconda3\lib\site-packages (from pydantic>=2->ydata-profiling) (2.20.1) Requirement already satisfied: typing-extensions>=4.6.1 in c:\users\b3stu\anaconda3\lib\site-packages (from pydantic>=2->ydata-profiling) (4.15.0) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\b3stu\anaconda3\lib\site-packages (from requests<3,>=2.24.0->ydata-profiling) (2024.12.14) Requirement already satisfied: patsy>=0.5.6 in c:\users\b3stu\anaconda3\lib\site-packages (from statsmodels<1,>=0.13.2->ydata-profiling) (0.5.6) Requirement already satisfied: colorama in c:\users\b3stu\anaconda3\lib\site-packages (from tqdm<5,>=4.48.2->ydata-profiling) (0.4.6) Requirement already satisfied: attrs>=19.3.0 in c:\users\b3stu\anaconda3\lib\site-packages (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (23.1.0) Requirement already satisfied: networkx>=2.4 in c:\users\b3stu\anaconda3\lib\site-packages (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (3.3) Requirement already satisfied: puremagic in c:\users\b3stu\anaconda3\lib\site-packages (from visions<0.8.2,>=0.7.5->visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling) (1.30) Requirement already satisfied: six in c:\users\b3stu\anaconda3\lib\site-packages (from patsy>=0.5.6->statsmodels<1,>=0.13.2->ydata-profiling) (1.16.0)
Upgrade to ydata-sdk
Improve your data and profiling with ydata-sdk, featuring data quality scoring, redundancy detection, outlier identification, text validation, and synthetic data generation.
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/21 [00:00<?, ?it/s] 10%|▉ | 2/21 [00:00<00:02, 6.80it/s] 29%|██▊ | 6/21 [00:00<00:01, 13.54it/s] 48%|████▊ | 10/21 [00:00<00:00, 17.84it/s] 62%|██████▏ | 13/21 [00:00<00:00, 19.35it/s] 100%|██████████| 21/21 [00:00<00:00, 22.44it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
✅ Profiling report generated: healthcare_profile_report.html
In [33]:
profile = ProfileReport(df, title="Emade Store Data Profiling Report", explorative=True)
In [35]:
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
0%| | 0/21 [00:00<?, ?it/s] 10%|▉ | 2/21 [00:00<00:02, 6.78it/s] 29%|██▊ | 6/21 [00:00<00:01, 11.82it/s] 48%|████▊ | 10/21 [00:00<00:00, 16.85it/s] 100%|██████████| 21/21 [00:00<00:00, 22.61it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[35]:
In [36]:
import pandas as pd
# 1. Basic summary statistics
summary = df.describe(include='all').transpose()
# 2. Missing values
missing = df.isnull().sum().reset_index()
missing.columns = ['Column', 'MissingValues']
# 3. Correlation matrix
correlation = df.corr(numeric_only=True)
# 4. Export all to Excel
with pd.ExcelWriter("healthcare_eda_summary.xlsx") as writer:
summary.to_excel(writer, sheet_name="Summary Stats")
missing.to_excel(writer, sheet_name="Missing Values", index=False)
correlation.to_excel(writer, sheet_name="Correlations")
print("✅ EDA results saved to healthcare_eda_summary.xlsx")
✅ EDA results saved to healthcare_eda_summary.xlsx
In [49]:
import pandas as pd
import os
# Load your healthcare data with explicit encoding
# Adding encoding parameter to handle non-UTF-8 characters
df = pd.read_csv(r"C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv",
encoding='latin1') # Try latin1 encoding which is more permissive
# Alternative encodings to try: 'cp1252', 'ISO-8859-1', etc.
# 1. Summary statistics
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"
# 2. Missing values
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"
# 3. Correlation matrix (flattened)
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"
# Combine all
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)
# Save to folder path
output_path = "output/healthcare_eda_results.csv"
os.makedirs("output", exist_ok=True)
combined.to_csv(output_path, index=False, encoding='utf-8') # Specify encoding for output
print(f"✅ File saved at: {output_path}")
✅ File saved at: output/healthcare_eda_results.csv
In [ ]:
r"C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv"
In [51]:
import pandas as pd
import os
import shutil
import zipfile
from datetime import datetime
# ========== CONFIG ==========
BASE_DIR = "output"
date_str = datetime.today().strftime("%Y-%m-%d")
DAILY_DIR = os.path.join(BASE_DIR, date_str) # Folder per day
OUTPUT_FILE = os.path.join(DAILY_DIR, f"healthcare_eda_results_{date_str}.csv")
ARCHIVE_DIR = os.path.join(BASE_DIR, "archive")
In [53]:
import pandas as pd
import os
from datetime import datetime
# ========== CONFIG ==========
OUTPUT_DIR = "output"
date_str = datetime.today().strftime("%Y-%m-%d")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, f"healthcare_eda_results_{date_str}.csv")
# Example EDA export
df = pd.read_csv("healthcare_data.csv")
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
combined.to_csv(OUTPUT_FILE, index=False)
print(f"✅ File saved at {OUTPUT_FILE}")
✅ File saved at output\healthcare_eda_results_2025-08-31.csv
In [55]:
# ========== CREATE NEW DAILY FOLDER ==========
os.makedirs(DAILY_DIR, exist_ok=True)
# ========== SAMPLE DATA ==========
df = pd.read_csv("healthcare_data.csv")
# Summary
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"
# Missing values
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"
# Correlations
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"
# Combine all
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)
# Save to new folder
combined.to_csv(OUTPUT_FILE, index=False)
print(f"✅ File saved at {OUTPUT_FILE}")
print(f"📦 Previous folders zipped into {ARCHIVE_DIR}")
✅ File saved at output\healthcare_eda_results_2025-08-31.csv 📦 Previous folders zipped into output\archive
In [59]:
import pandas as pd
# Instead of reading from a non-existent file, let's create a sample DataFrame
df = pd.DataFrame({
'Postal Code': [25, 30, 45, 60, 35, 42],
'Sales': [120, 130, 140, 150, 125, 135],
'Quantity': [200, 220, 240, 260, 210, 230],
'Discount': [85, 90, 110, 130, 95, 105],
'Profit': [70, 75, 80, 85, 72, 78]
})
# Create the summary DataFrames
numeric_summary = df.describe().T # Transpose for better Excel format
missing_values = pd.DataFrame({
'Column': df.columns,
'Missing Values': df.isnull().sum().values,
'Percentage': (df.isnull().sum() / len(df) * 100).values
})
correlations = df.select_dtypes(include=['number']).corr()
In [61]:
# Now proceed with your Excel export code
if not numeric_summary.empty and not missing_values.empty and not correlations.empty:
with pd.ExcelWriter("healthcare_eda_full.xlsx") as writer:
numeric_summary.to_excel(writer, sheet_name="Variable Summary")
missing_values.to_excel(writer, sheet_name="Missing Values", index=False)
correlations.to_excel(writer, sheet_name="Correlations")
print("✅ Full EDA report saved to healthcare_eda_full.xlsx")
✅ Full EDA report saved to healthcare_eda_full.xlsx
In [65]:
# Remove the standalone 'else' statement and just keep the code block
# Create a default sheet if any DataFrame is empty
with pd.ExcelWriter("healthcare_eda_full.xlsx") as writer:
# Write non-empty DataFrames
if not numeric_summary.empty:
numeric_summary.to_excel(writer, sheet_name="Variable Summary")
if not missing_values.empty:
missing_values.to_excel(writer, sheet_name="Missing Values", index=False)
if not correlations.empty:
correlations.to_excel(writer, sheet_name="Correlations")
# Ensure at least one sheet exists
pd.DataFrame({'Note': ['No data available']}).to_excel(writer, sheet_name="Info")
print("✅ Full EDA report saved to healthcare_eda_full.xlsx (some sheets may be empty)")
✅ Full EDA report saved to healthcare_eda_full.xlsx (some sheets may be empty)
In [67]:
with pd.ExcelWriter("healthcare_eda_full.xlsx") as writer:
numeric_summary.to_excel(writer, sheet_name="Variable Summary")
missing_values.to_excel(writer, sheet_name="Missing Values", index=False)
correlations.to_excel(writer, sheet_name="Correlations")
print("✅ Full EDA report saved to healthcare_eda_full.xlsx")
✅ Full EDA report saved to healthcare_eda_full.xlsx
In [77]:
import pandas as pd
import os
import shutil
from datetime import datetime
# ========== CONFIG ==========
BASE_DIR = "output"
date_str = datetime.today().strftime("%Y-%m-%d")
DAILY_DIR = os.path.join(BASE_DIR, date_str) # Folder per day
OUTPUT_FILE = os.path.join(DAILY_DIR, f"Emadestore{date_str}.csv")
# ========== ARCHIVE PREVIOUS ==========
if os.path.exists(BASE_DIR):
for f in os.listdir(BASE_DIR):
f_path = os.path.join(BASE_DIR, f)
if os.path.isfile(f_path): # move old flat files to archive
archive_dir = os.path.join(BASE_DIR, "archive")
os.makedirs(archive_dir, exist_ok=True)
shutil.move(f_path, os.path.join(archive_dir, f))
In [79]:
# ========== CREATE NEW DAILY FOLDER ==========
os.makedirs(DAILY_DIR, exist_ok=True)
# ========== SAMPLE DATA ==========
# Modified to handle encoding issues by specifying encoding and error handling
df = pd.read_csv(r"C:\Users\B3Stu\OneDrive\Documents\Python Exercises\Emadestore.csv",
encoding='latin1') # Try 'latin1' or 'ISO-8859-1' encoding instead of default UTF-8
# Summary
summary = df.describe(include='all').transpose().reset_index()
summary['Section'] = "Summary Stats"
# Missing values
missing = df.isnull().sum().reset_index()
missing.columns = ['index', 'MissingValues']
missing['Section'] = "Missing Values"
# Correlations
corr = df.corr(numeric_only=True).stack().reset_index()
corr.columns = ['Var1', 'Var2', 'Correlation']
corr['Section'] = "Correlations"
# Combine all
combined = pd.concat([summary, missing, corr], axis=0, ignore_index=True)
# Save to new folder with explicit encoding
combined.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')
print(f"✅ File saved at {OUTPUT_FILE}")
✅ File saved at output\2025-08-31\Emadestore2025-08-31.csv
In [ ]: